1. Total Adult Population
total_adults <- sum(data$fweight)
data.frame(
Metric = "Total Adults",
Value = format(total_adults, big.mark = ",")
) %>%
kbl() %>%
kable_styling(
bootstrap_options = c("striped", "hover", "condensed"),
full_width = FALSE,
position = "center"
)
|
Metric
|
Value
|
|
Total Adults
|
142,497.1
|
2. Gender Distribution
gender_dist <- data %>%
group_by(sex) %>%
summarize(Population = round(sum(fweight))) %>% # Rounded to integers
mutate(Percentage = Population / sum(Population) * 100)
plot_ly(gender_dist, labels = ~sex, values = ~Population, type = 'pie',
hole = 0.6,
marker = list(colors = viridis(2))) %>%
layout(
title = list(text = "Gender Distribution ", y = 0.6, x=0.46),
showlegend = TRUE,
annotations = list(
text = paste0("Total: ", format(round(sum(gender_dist$Population)), big.mark = ",")),
showarrow = FALSE,
font = list(size = 20)
)
)
gender_dist %>%
kbl(col.names = c("Gender", "Population", "Percentage (%)")) %>%
kable_styling(
bootstrap_options = c("striped", "hover", "condensed"),
full_width = FALSE,
position = "center"
)
|
Gender
|
Population
|
Percentage (%)
|
|
Female
|
72190
|
50.66072
|
|
Male
|
70307
|
49.33928
|
3. Adults in Each Province
province_dist <- data %>%
group_by(province) %>%
summarize(Population = sum(fweight)) %>%
mutate(Percentage = round(Population / sum(Population) * 100, 2))
province_dist %>%
kbl(col.names = c("Province", "Population", "Percentage (%)")) %>%
kable_styling(
bootstrap_options = c("striped", "hover", "condensed"),
full_width = FALSE,
position = "center"
)
|
Province
|
Population
|
Percentage (%)
|
|
A
|
5152.921
|
3.62
|
|
B
|
20400.948
|
14.32
|
|
C
|
16153.780
|
11.34
|
|
D
|
20061.399
|
14.08
|
|
E
|
24588.934
|
17.26
|
|
F
|
16328.008
|
11.46
|
|
G
|
8157.787
|
5.72
|
|
H
|
31653.346
|
22.21
|
4. Proportion of Employed Adults
employment_rate <- data %>%
summarize(
Employed = sum(fweight[employed == 1]),
Total = sum(fweight)
) %>%
mutate(Proportion_Employed = round(Employed / Total * 100, 2))
data.frame(
Metric = c("Employed Adults", "Total Adults", "Proportion Employed (%)"),
Value = c(
format(employment_rate$Employed, big.mark = ","),
format(employment_rate$Total, big.mark = ","),
paste0(employment_rate$Proportion_Employed, "%")
)
) %>%
kbl() %>%
kable_styling(
bootstrap_options = c("striped", "hover", "condensed"),
full_width = FALSE,
position = "center"
)
|
Metric
|
Value
|
|
Employed Adults
|
75,808.6
|
|
Total Adults
|
142,497.1
|
|
Proportion Employed (%)
|
53.2%
|
5. Mean Income by Gender and Province
mean_income_data <- data %>%
group_by(province, sex) %>%
summarize(
Mean_Income = mean(income, na.rm = TRUE),
SE = sd(income, na.rm = TRUE) / sqrt(n())
)
mean_income_data %>%
kbl(col.names = c("Province", "Gender", "Mean Income", "Standard Error")) %>%
kable_styling(
bootstrap_options = c("striped", "hover", "condensed"),
full_width = FALSE,
position = "center"
)
|
Province
|
Gender
|
Mean Income
|
Standard Error
|
|
A
|
Female
|
1972.0222
|
226.81528
|
|
A
|
Male
|
4881.2430
|
289.53311
|
|
B
|
Female
|
2622.5852
|
285.44389
|
|
B
|
Male
|
6788.1077
|
454.30157
|
|
C
|
Female
|
2135.3495
|
253.87328
|
|
C
|
Male
|
6460.1794
|
412.93693
|
|
D
|
Female
|
2018.0339
|
185.19456
|
|
D
|
Male
|
7798.1606
|
659.62649
|
|
E
|
Female
|
1261.1667
|
140.23414
|
|
E
|
Male
|
4449.4340
|
273.94793
|
|
F
|
Female
|
3902.1069
|
492.33321
|
|
F
|
Male
|
9763.6917
|
667.46453
|
|
G
|
Female
|
915.1712
|
74.58427
|
|
G
|
Male
|
4440.5176
|
420.38467
|
|
H
|
Female
|
1623.9482
|
161.50561
|
|
H
|
Male
|
8172.4702
|
743.30229
|
6. Income Distribution by Employment Status and Gender
ggplot(data, aes(x = sex, y = income, fill = interaction(employed, sex))) +
geom_violin(alpha = 0.7, scale = "width", trim = FALSE) +
geom_boxplot(width = 0.1, outlier.size = 0.5, alpha = 0.6) +
scale_y_log10() +
scale_fill_viridis_d() +
labs(
title = "Income Distribution by Employment Status and Gender",
x = "Gender",
y = "Income (Log Scale)"
) +
facet_wrap(~ employed, labeller = labeller(employed = c(`0` = "Unemployed", `1` = "Employed"))) +
theme_minimal() +
theme(
plot.title = element_text(size = 14, face = "bold"),
axis.title = element_text(size = 12),
strip.text = element_text(size = 12)
)

7. Gini Coefficient
# Calculate Gini Coefficient
gini_coefficient <- function(income, weights) {
sorted_indices <- order(income)
sorted_income <- income[sorted_indices]
sorted_weights <- weights[sorted_indices]
cum_weights <- cumsum(sorted_weights) / sum(sorted_weights)
cum_income <- cumsum(sorted_income * sorted_weights) / sum(sorted_income * sorted_weights)
gini <- 1 - sum((cum_income[-length(cum_income)] + cum_income[-1]) * diff(cum_weights))
return(gini)
}
gini <- gini_coefficient(data$income, data$fweight)
# Output Gini Coefficient
data.frame(
Metric = "Gini Coefficient",
Value = round(gini, 3)
) %>%
kbl() %>%
kable_styling(
bootstrap_options = c("striped", "hover", "condensed"),
full_width = FALSE,
position = "center"
)
|
Metric
|
Value
|
|
Gini Coefficient
|
0.75
|
8. Confidence Interval for Female Mean Income
# Filter data for females
female_data <- data %>% filter(sex == "Female")
# Calculate weighted mean and standard error
weighted_mean <- sum(female_data$income * female_data$fweight) / sum(female_data$fweight)
weighted_variance <- sum(((female_data$income - weighted_mean)^2) * female_data$fweight) / sum(female_data$fweight)
weighted_se <- sqrt(weighted_variance / nrow(female_data))
# 95% confidence interval
ci_lower <- weighted_mean - qnorm(0.975) * weighted_se
ci_upper <- weighted_mean + qnorm(0.975) * weighted_se
data.frame(
Metric = c("Weighted Mean Income", "Lower 95% CI", "Upper 95% CI"),
Value = c(
round(weighted_mean, 2),
round(ci_lower, 2),
round(ci_upper, 2)
)
) %>%
kbl() %>%
kable_styling(
bootstrap_options = c("striped", "hover", "condensed"),
full_width = FALSE,
position = "center"
)
|
Metric
|
Value
|
|
Weighted Mean Income
|
2063.91
|
|
Lower 95% CI
|
1885.17
|
|
Upper 95% CI
|
2242.65
|
9. Factors to Improve Confidence Interval
To improve the confidence interval:
Increase the sample size to reduce variability.
Ensure accurate weighting to reflect the population.
Account for any missing or outlier data points.
Consider stratified sampling to improve representativeness across
groups.